********************************************************************************
* 
* Merging the DATA 

********************************************************************************

clear all 

global data "E:\Data"


********************
* 	REFERRALS 
********************

****Case Referral Data 
import excel "$data\Raw\allegations_feb2020\Referrals 2008-2018", sheet("Data") firstrow case(lower) clear  

duplicates report refer_id //unique by refer_id 
tab ref_type, m 
gen ref_type_cleaned=ref_type 
	replace ref_type_cleaned="Unknown"  if ref_type=="" | ref_type=="NONE" 
	replace ref_type_cleaned="CPS"  if ref_type=="CPS, GPS"
	replace ref_type_cleaned="GPS"  if ref_type=="GPS, NONE"
tab ref_type_cleaned, m

*replace one case where refer_id was inputted as cas_id 
replace cas_id=. if cas_id==503656

*creating cleaned mandated reporter variable
preserve
import excel using "$data\Mandated Reporter Groupings", firstrow clear 
tempfile mndt 
save "`mndt'"
restore 
replace relationship_to_report=strtrim(relationship_to_report)
merge m:1 relationship_to_report using "`mndt'" ,gen(_mergemndt)
drop _mergemndt 
replace reporter_groups=strtrim(reporter_groups)
replace reporter_groups="other reporter" if reporter_groups=="other government" 

tab call_scrn_outcome,m //281 missing 
	gen screenout=(substr(call_scrn_outcome,1,10)=="Screen Out") 
	replace screenout=1 if call_scrn_outcome=="Field Screen Out" 
	replace screenout=0 if call_scrn_outcome=="" & service_decision!=""
	label var screenout "Referral Screened Out"
	gen screenin=1-screenout 
	label var screenin "Referral Screened In"
	
	gen accept=(service_decision=="Accept For Service"|service_decision=="Connect to a Closed Family Services/ JPO/ IL Case and Re-Open the Case"|service_decision=="Connect to an Open Family Services/ JPO/ IL Case") 
	label var accept "Accept for service"

	
********************************
* 	EXPUNGEMENT FLAG 
********************************	
preserve 
	import excel using "$data\Raw\MPdata\Referrals 2008-2018 Expungements", firstrow case(l) clear 
	tempfile expu 
save "`expu'"
restore 

merge 1:1 refer_id using "`expu'"
drop _merge 
tab ref_type expunged 
	
********************************
* 	INVESTIGATION CLOSE DATES 
********************************

**** Merging Updated Investigation Close Date 
rename invest_close_dt original_invest_close_dt 
preserve 
import excel using "$data\Raw\MPdata\Updated Referral Investgation Closure Dates", firstrow case(l) clear 
tempfile data 
save "`data'"
restore 
merge 1:1 refer_id using "`data'"
drop _merge 
count if invest_close_dt!=original_invest_close_dt //55 cases, all of which were from investigations that closed in 2019. So not to be worried about. 
drop invest_close_dt 
list *invest* if original_invest_close_dt<initial_invest_close_dt & initial_invest_close_dt<. //one case not to be worried about (only 1day of difference)
label var original_invest_close_dt "Investigation date in initial data sent by data partner"
label var initial_invest_close_dt "Investigation date sent by data partner as the corrected date"

*Creating and Formatting date variables 
foreach x in original_invest_close_dt initial_invest_close_dt intake_dt { 
gen `x'_mdy=dofc(`x')
format `x'_mdy %td 
}
gen earliest_invest_close_dt=min(original_invest_close_dt_mdy,initial_invest_close_dt_mdy)
format earliest_invest_close_dt %td
label var earliest_invest_close_dt "Earliest Investigation Close Date"
gen intake_dt_my=mofd(intake_dt_mdy)
format intake_dt_my %tm
gen intake_dt_y=year(intake_dt_mdy)




**********************************************
* 	INVESTIGATION CASEWORKER ASSIGNMENTS  
**********************************************

preserve 
	import excel "$data\Raw\MPdata\Assignment_file_2008_2018_Updated", sheet("Sheet1") firstrow case(lower) clear	
	gen casorref=length(string(cas_id_or_ref_id))
	keep if type=="INV"
	drop if type_of_assignment=="Program Area" |type_of_assignment=="Unit" 
	tab casorref, m   //all referral IDs! 
	rename cas_id_or_ref_id refer_id
	drop casorref 
	tempfile investigators
	save "`investigators'"
restore 

merge 1:m refer_id using "`investigators'", gen(_mergeINV) 
tab unit_name if _mergeINV==2 //these actually seem like screener assignments 
drop if _mergeINV==2 

tab screenout _mergeINV, m 

//ISSUES TO RESOLVE 
/* If the referral was screened out, then I should not have been matching any observations.  On the contrary, if the referral was screened in, then I should have a matching investigation except for expunged cases. */ 

*Replacing the strange cases that should have been screened out to missing (only 92 instances)
foreach x in asgn_id assignment_start_dt  { 
replace `x'=. if screenout==1 & _mergeINV==3 
}
foreach x in regional_office assignment_end_dt type_of_assignment caseworker_id unit_name type { 
replace `x'="" if screenout==1 & _mergeINV==3 
}
gen notes_INV = "Not Investigated" if screenout==1 & _mergeINV==1 
replace notes_INV = "Supposed to not be Investigated but Found Assignment which I deleted" if screenout==1 & _mergeINV==3 
label var notes_INV "Any issues in Finding Investigators"
* Checking expungement cases 
	tab expunged _mergeINV, m
	tab expunged if screenout==0 & _mergeINV==1, m //3,654 (57%) of cases where could not find an investigator are expunged cases 
	replace notes_INV = "Expunged Investigation Assignments" if screenout==0 & _mergeINV==1 & expunged=="Y"   
* Cases where previously open case do not typically get an investigator assignment. Cases where re-opening the case it would depend on whether or not the case was closed less than 90 days before or not. 
tab _mergeINV if call_scrn_outcome=="Accept: Open Case (Accept for Service and Do Not Open Investigation)" | call_scrn_outcome=="Accept: **Connect to an Open Case**" |  call_scrn_outcome=="Accept: Actively working with this family" 
//setting 8 observations that were not assigned investigators but for whom I found one in the investigation files to missing 
foreach x in asgn_id assignment_start_dt { 
replace `x'=. if _mergeINV==3 & (call_scrn_outcome=="Accept: Open Case (Accept for Service and Do Not Open Investigation)" | call_scrn_outcome=="Accept: **Connect to an Open Case**" |  call_scrn_outcome=="Accept: Actively working with this family" )  
}
foreach x in regional_office assignment_end_dt type_of_assignment caseworker_id unit_name type { 
replace `x'="" if _mergeINV==3 & (call_scrn_outcome=="Accept: Open Case (Accept for Service and Do Not Open Investigation)" | call_scrn_outcome=="Accept: **Connect to an Open Case**" |  call_scrn_outcome=="Accept: Actively working with this family" ) 
}
replace notes_INV = "Open Case so no Investigator Assignment" if  (call_scrn_outcome=="Accept: Open Case (Accept for Service and Do Not Open Investigation)" | call_scrn_outcome=="Accept: **Connect to an Open Case**" |  call_scrn_outcome=="Accept: Actively working with this family" )

tab notes_INV _mergeINV , m //551 cases where I could not find an investigator assignment 
tab accept if notes_INV=="" & _mergeINV==1, m
replace notes_INV = "Missing Investigator Assignment" if _mergeINV==1 & notes_INV=="" 
duplicates drop //some of those cases had multiple investigator assignments - 31 duplicates deleted 

//NOW ACTUALLY LOOKING AT INVESTIGATION ASSIGNMENTS 

*dropping assignments that are actually screening 
gen screening_unit=(unit_name=="Call Screening 1"|unit_name=="Caseworker Supervisor/Night Intake"|unit_name=="CW Supervisor-Night/Weekend - Day"|unit_name=="CW Supervisor-Night/Weekend - Evening"|unit_name=="INTAKE 7"|unit_name=="INTAKE 8")  if unit_name!="" 
bys refer_id: egen allscreeningunit=min(screening_unit)
tab allscreeningunit
* a few referrals only have screener assignments as investigator assignments, so wipping those out 
foreach x in asgn_id assignment_start_dt  { 
replace `x'=. if allscreeningunit==1  
}
foreach x in regional_office assignment_end_dt type_of_assignment caseworker_id unit_name type { 
replace `x'="" if allscreeningunit==1  
} 
replace notes_INV = "Investigator Assignments were Actually Screening Assignments" if allscreeningunit==1  
duplicates drop //15 duplicates deleted 
* dropping all screening assignments for other cases 
distinct refer_id 
drop if screening_unit==1 & allscreening==0 //47,385 assignments deleted! 
distinct refer_id 
drop screening_unit allscreeningunit 


*dropping assignments that are in offices that do not investigate 
gen investoffice=(regional_office=="CRO" | regional_office=="ERO"| regional_office=="MVRO" | regional_office=="NRO" | regional_office=="SRO" | regional_office=="INTAKE") if unit_name!="" 
replace investoffice=0 if regional_office=="LFCO/LAO" //there's one case where unit_name is misisng but where there's this type of regional office 
bys refer_id: egen someinvest=max(investoffice) 
tab someinvest 
* a few referrals only have investigation assignments outside of regional offices, so wipping those out 
foreach x in asgn_id assignment_start_dt { 
replace `x'=. if someinvest==0 
}
foreach x in regional_office assignment_end_dt type_of_assignment caseworker_id unit_name type { 
replace `x'="" if  someinvest==0 
} 
replace notes_INV = "All Investigator Assignments were Actually not in Regional Offices nor Intake" if someinvest==0   
duplicates drop //32 duplicates deleted 
* dropping all assignments for other cases 
distinct refer_id 
drop if investoffice==0 & someinvest==1 //118 assignments deleted 
distinct refer_id 
drop investoffice someinvest  

duplicates tag refer_id, gen(dup)
replace notes_INV = "One Investigator Assignment" if dup==0 & notes_INV==""   //38,416 cases 


*Creating and Formatting date variables 
gen v=substr(assignment_end_dt,1,9)
gen assignment_end_dt_mdy=date(v, "DMY")
format assignment_end_dt_mdy %td 
drop v 
foreach x in assignment_start_dt { 
gen `x'_mdy=dofc(`x')
format `x'_mdy %td 
}
gen dayswithworker=assignment_end_dt_mdy-assignment_start_dt_mdy 
summ  dayswithworker if notes_INV=="One Investigator Assignment", detail
gen closedinvest= earliest_invest_close_dt ==assignment_end_dt_mdy
tab closedinvest if notes_INV=="One Investigator Assignment" //yes! that happens to seem to work pretty well! 99% of cases have a investigator that closed the investigation. 
tab closedinvest if dup>0 
summ dayswithworker if closedinvest==1 & dup>0, detail 

*assignments that occurred after investigation close date 
gen assigned_afterinvest= assignment_start_dt_mdy>earliest_invest_close_dt if assignment_start_dt_mdy<. & earliest_invest_close_dt<.
tab assigned_afterinvest
tab assigned_afterinvest if dup>0 
bys refer_id: egen allafterinvest=min(assigned_afterinvest)
tab allafterinvest //16 cases with one investigator assignment of 1 day 
drop if assigned_afterinvest==1 & allafterinvest==0 
drop dup allafterinvest assigned_afterinvest
duplicates tag refer_id, gen(dup)
tab notes_INV dup, m  

*lots of assignments where 1-day assignments. Keeping the non-1 day assignments  
gen samedayassignment=assignment_start_dt_mdy==assignment_end_dt_mdy
bys refer_id: egen allsameday=min(samedayassignment) 
tab allsameday if dup>0 
sort refer_id asgn_id 
*keeping the first assignment when multiple 1-day assignments 
by refer_id: drop if _n>1 & allsameday==1 //13 deleted 
replace notes_INV="First 1-day assignment" if allsameday==1 & dup>0  
drop if samedayassignment==1 & allsameday==0 & dup>0 //7,145 deleted 
drop dup samedayassignment allsameday 
duplicates tag refer_id, gen(dup) 
tab notes_INV dup, m  
replace notes_INV = "One Investigator Assignment" if dup==0 & notes_INV==""   //6,703 cases 

/* When there are multiple investigators recorded, I keep note of three investigators: 
- the first assignment => inv1
- the last assignment  =>invN
- the longest time assigned investigation  =>inv_
*/ 

sort refer_id asgn_id 
distinct refer_id if dup>0 
replace notes_INV="Choice between Several Investigators" if dup>0 
by refer_id: gen firstasgn=_n==1 if dup>0 
by refer_id: gen lastasgn=_n==_N if dup>0 
by refer_id: egen var=max(dayswithworker) if dup>0 
gen longestasgn=dayswithworker==var if dup>0 

preserve 
keep if dup==0 | (dup>0 &firstasgn==1) 
foreach x in asgn_id regional_office assignment_start_dt_mdy assignment_end_dt_mdy caseworker_id unit_name {
rename `x' `x'_inv1
}
tempfile FirstOption
save "`FirstOption'"
restore 

preserve 
keep if dup==0 | (dup>0 &longestasgn==1) 
sort refer_id asgn_id  
by refer_id: keep if _n==1 //keeping the first longest assignment if two equal lengthed assignments 
foreach x in asgn_id regional_office assignment_start_dt_mdy assignment_end_dt_mdy caseworker_id unit_name {
rename `x' `x'_inv_
}
tempfile LongOption
save "`LongOption'"
restore 

keep if dup==0 | (dup>0 & lastasgn==1) 
foreach x in asgn_id regional_office assignment_start_dt_mdy assignment_end_dt_mdy caseworker_id unit_name {
rename `x' `x'_invN
}
merge 1:1 refer_id using "`FirstOption'"
tab _merge 
merge 1:1 refer_id using "`LongOption'", gen(_merge2)
drop _merge _merge2 type_of_assignment type firstasgn lastasgn longestasgn var dup closedinvest dayswithworker _mergeINV

replace notes_INV="One Investigator Assignment" if notes_INV=="Choice between Several Investigators" & caseworker_id_inv_==caseworker_id_inv1 & caseworker_id_invN==caseworker_id_inv_ //638 had multiple investigator assignments but all of the methods point to same investigator 


***********************
* 	CLIENT REFERRALS 
***********************

**** Merging and Exploring Client-Level Case Referral Data 
drop service_decision call_scrn_outcome 

preserve 
	import excel "$data\Raw\allegations_feb2020\Referral_Clients 2008-2018", sheet("Sheet 1") firstrow case(lower) clear  //allstring
	tempfile temp 
	save "`temp'" 
restore 

merge 1:m refer_id using "`temp'"
drop _merge 
duplicates report refer_cl //unique client and refer id 



****************************************
* 	ALLEGATIONS 
****************************************

preserve 
*Creating groupings of allegations 
import excel "$data\Allegation Groupings", sheet("Sheet1") firstrow case(lower) clear 
replace code=strtrim(code) 
tempfile data1 
save "`data1'"
import excel "$data\Allegation Groupings", sheet("Sheet3") clear 
rename A code 
replace code=strtrim(code) 
rename B allegation_group
merge 1:m code using "`data1'"
drop _merge 
save "`data1'", replace 

import excel "$data\Raw\allegations_feb2020\Allegations 2008-2018", sheet("Data") firstrow case(lower) clear  
/*understanding the structure of the data - 
Each observation is a referal, client, perpetrator, allegation unit. So if there are several allegations for a child, each of these is a different row. If there are several perpetrator, each of them is a row.  */ 
*merging in allegation groupings 
replace allegation=strtrim(allegation)
merge m:1 allegation using "`data1'"
replace allegation_group="other" if _merge==1 
replace code="Other" if _merge==1 
drop _merge 

drop perp_id refer_cl call_scrn_outcome service_decision cas_id //I don't need that actually for now 
duplicates drop //42238 dropped 

tab allegation_group, gen(allegation)
tab findings 
label define alleg 0 "Not an Allegation" 1 "Allegation with No Findings" 2 "Allegation  Invalid or Unfounded" 3 "Allegation Indicated" 4 "Allegation Founded or Valid"

sort refer_id 
forvalues i=1/22 { 
replace allegation`i'=2 if (findings=="Invalid" | findings=="Unfounded") & allegation`i'==1 
replace allegation`i'=3 if findings=="Indicated" & allegation`i'==1 
replace allegation`i'=4 if (findings=="Founded"|findings=="Valid") & allegation`i'==1 
by refer_id : egen vallegation`i'=max(allegation`i')
local name: variable label allegation`i' 
dis "`name'"
drop allegation`i'
rename vallegation`i' allegation`i'
label var allegation`i' "`name'"
label val allegation`i' alleg 
}

keep refer_id allegation*
drop allegation allegation_group 
duplicates drop 
save "`data1'", replace 
restore 

merge m:1 refer_id  using "`data1'" //to have information about allegations on the referral, I grouped together all allegations for different children and different perpetrators 

drop _merge 
save "`data1'", replace 


*********************
* Preparing Placements Data 
*********************


import excel "$data\Raw\allegheny_raw_25nov19\Placements_All_RelationAppend", sheet("Placements") firstrow case(lower) clear  

*some data cleaning/preparation 
drop cl_dob cl_sex cl_ethn_desc cl_race_desc cl_race_group mci_id view_build_date //plcmnt_type* 

gen plcmnt_entry_mdy=dofc(plcmnt_entry_dt)
format plcmnt_entry_mdy %td 
gen plcmnt_exit_dt2=clock(plcmnt_exit_dt, "MDYhm" )
gen plcmnt_exit_mdy=dofc(plcmnt_exit_dt2)
format plcmnt_exit_mdy %td 
foreach x in entry exit {
	gen rmvl_`x'_dt_mdy=dofc(rmvl_`x'_dt)
	format rmvl_`x'_dt_mdy %td 
}


bys client_id rmvl_seq_nbr (plcmnt_seq_nbr): keep if _n==1 //that means that all the rem_ variables (removal reasons) will be for the first placement
keep client_id case_id rmvl_seq_nbr rmvl_in_care rmvl_los_days rmvl_rtrn_rsn* rmvl_entry_dt_mdy rmvl_exit_dt_mdy rmvl_type_amend 
rename case_id cas_id 
rename client_id cl_id 
rename cas_id placement_cas_id 
tempfile placements 
save "`placements'"

*********************
* Merging Placement Data to Referrals File 
********************* 
use "`data1'", clear 
duplicates report refer_id cl_id //573040 observations 
distinct refer_id //114369 
distinct cl_id //257913

joinby cl_id using "`placements'", unmatched(both)
tab _merge 
gen rmvl_yr=year(rmvl_entry_dt_mdy)
tab rmvl_yr if _merge==2, m 

drop if _merge==2 
drop _merge 

*Figure out for each referral, if that placement was relevant 
gen intake_1ylater=intake_dt_mdy+365
format intake_1ylater %td
gen intake_3mlater=intake_dt_mdy+90
format intake_3mlater %td

gen within3m= (intake_dt_mdy<=rmvl_entry_dt_mdy & rmvl_entry_dt_mdy<=intake_3mlater & rmvl_entry_dt_mdy<. &(rmvl_type_amend=="BOTH"|rmvl_type_amend=="CYF"))  
gen within1y= (intake_dt_mdy<=rmvl_entry_dt_mdy & rmvl_entry_dt_mdy<=intake_1ylater & rmvl_entry_dt_mdy<.&(rmvl_type_amend=="BOTH"|rmvl_type_amend=="CYF")) 
tab within3m within1y, m  
foreach x in  within3m within1y { 
    bys refer_id cl_id (rmvl_seq_nbr): egen ever`x'=max(`x')
}


*The above variables will only be generated when I matched to some removal data. Replacing the variables when case was screened in but no matching removal.  
foreach x in 3m 1y {
replace within`x'=0 if within`x'==. & screenin==1 
}

*Updating placement variables to keep only placement details where removal within 1y 
foreach x in  rmvl_entry_dt_mdy rmvl_exit_dt_mdy rmvl_los_days  { 
replace `x'=. if within1y==0 
}
//rmvl_entry_dt rmvl_mo rmvl_yr rmvl_exit_dt     court_order runaway fprequest crequest fpcope harm rem_abandon rem_chld_beh rem_neg_educ  rem_neg_nutr rem_resump    rem_unk rem_alc_adlt  rem_chld_bh   rem_dom_viol  rem_emot_inj  rem_neg_hyg   rem_neg_sfty  rem_prnt_bh   rem_sex_abus rem_alc_chld   rem_incarc    rem_neg_med   rem_neg_shel  rem_death_prnt rem_drug_chld rem_phys_abus rem_prnt_cope rem_prnt_skls rem_chld_disab rem_drug_adlt rem_prnt_chld_conf rem_relinq    rem_truancy ref_ishospitalization
foreach x in  rmvl_rtrn_rsn rmvl_in_care rmvl_type_amend { 
    replace `x'="" if within1y==0 
}

*keeping the first removal within 3m 
drop if within3m!=1 & everwithin3m==1 
bys refer_id cl_id (rmvl_seq_nbr): drop if within3m==1 & _n>1 

*keeping the first removal within 1y 
drop if within1y!=1 & everwithin1y==1 
bys refer_id cl_id (rmvl_seq_nbr): drop if within1y==1 & _n>1 

*now can keep the first for everyone given that I assured that I only have one refer-client pair for cases where I need to keep removal/placement data information 
bys refer_id cl_id (rmvl_seq_nbr): drop if _n>1 
distinct refer_cl //great! 
drop everwithin3m everwithin1y intake_1ylater intake_3mlater 
label var within3m "First Removal within 3m of Intake date"
label var within1y "First Removal within 1y of Intake date"


save "$data\Processed\ReferralClient.dta", replace 

